## Replication code for Ahlquist, Mayer, and Jackman (2014), "Alien Abduction and Voter
## Impersonation in the 2012 U.S. General Election: Evidence from a Survey List
## Experiment." Election Law Journal 13 (4): 460-475. 

## Version control: 
## R version 3.1.3 "Smooth Sidewalk"
## RStudio version 0.98.1091
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
## Last updated 19-MAY-2015

#### SETUP ####
rm(list=ls())
library(foreign); library(list); library(sandwich); library(ggplot2); library(Amelia); library(texreg)
setwd("./AMJ_Replication_Final")
## Get extract function for ictreg
source("extractictreg.R")
## A function for the BIC from ICT regression
ict.bic <- function(model){
  bic <- -2*model$llik + log(dim(model$x)[1])*dim(model$vcov)[1]
  print(bic)
}
set.seed(pi)
## Import Wave 1 data
data <- read.csv(file="AMJ_wave_1_topost.csv")

##### PREPARE DATA FOR ANALYSIS ####
#### Group-level covariates
## Create measures of competitivenes. Battleground = presidential victor margin <5%; contested = margin < 7%; 
## sen.contested is Senate races <5%; and gov.contested is gubernatorial <5%.
data$battleground <- 0
data$contested <- 0
data$contested.sl <- 0
data$contested.max <- 0
battleground.states <- c("Florida", "Ohio", "North Carolina", "Virginia")
contested.states <- c(battleground.states, c("Wisconsin", "Colorado", "Nevada", 
                                             "Iowa", "Pennsylvania", "New Hampshire"))
sen.contested.states <- c("Nevada", "Arizona", "Indiana", "Montana", "North Dakota")
gov.contested.states <- c("Washington", "Indiana", "Montana", "West Virginia")
contested.max <- c(battleground.states, sen.contested.states, gov.contested.states)
contested.sl <- c(sen.contested.states, gov.contested.states)
data$battleground[data$inputstate %in% battleground.states] <- 1
data$contested[data$inputstate %in% contested.states] <- 1
data$contested.sl[data$inputstate %in% contested.sl] <- 1
data$contested.max[data$inputstate %in% contested.max] <- 1
rm(battleground.states,contested.states,sen.contested.states,
   gov.contested.states,contested.max,contested.sl)

## Code Voter ID laws, according to NCSL http://www.ncsl.org/legislatures-elections/elections/voter-id.aspx
## 0 = "no voter ID law", 1 = "non-photo ID", 2 = "strict non-photo", 3 = "photo ID", 4 = "strict photo ID"
data$state.voter.id <- 0
data$strict.id <- 0
data$voter.id.law <- 0
data$state.voter.id[data$inputstate %in% c("Alabama", "Alaska", "Arkansas", "Colorado", "Connecticut", 
                                           "Deleware", "Kentucky", "Missouri", "Montana",
                                           "North Dakota", "Oklahoma", "Rhode Island", "South Carolina",
                                           "Texas", "Utah", "Washington")] <- 1
data$state.voter.id[data$inputstate %in% c("Arizona", "Ohio", "Virginia")] <- 2
data$state.voter.id[data$inputstate %in% c("Florida", "Hawaii", "Idaho", "Louisiana", 
                                           "Michigan", "New Hampshire", "South Dakota")] <- 3
data$state.voter.id[data$inputstate %in% c("Georgia", "Indiana", "Kansas", "Tennessee")] <- 4
data$strict.id[data$inputstate%in% c("Arizona", "Ohio", "Virginia", "Georgia", "Indiana",
                                     "Kansas", "Tennessee")] <- 1
data$voter.id.law[data$state.voter.id > 0] <- 1
## Convert state voter ID into indicator variables
data$nph.id <- model.matrix(~as.factor(state.voter.id),data)[,2]
data$snph.id <- model.matrix(~as.factor(state.voter.id),data)[,3]
data$ph.id <- model.matrix(~as.factor(state.voter.id),data)[,4]
data$sph.id <- model.matrix(~as.factor(state.voter.id),data)[,5]

# Code states with election day registration 2012
data$edr <- 0
data$edr[data$inputstate %in% c("Idaho", "Iowa", "Maine", "Minnesota", "Montana",
                                "New Hampshire", "Wisconsin", "Wyoming")] <- 1

## Code states with all mail voting
data$amv <- 0
data$amv[data$inputstate %in% c("Oregon", "Washington")] <- 1

## Code absentee voting. Civilian absentee ballots transmitted as % of total 
## people participating in 2010 election. c1a/f1a by state.  "-999999" counted as 0 
## http://www.eac.gov/research/election_administration_and_voting_survey.aspx
abvo <- read.csv("ABV.csv", header=T)
data$absentee10 <- abvo$absentee.transmit.pct[match(data$inputstate, abvo$State)]
## ...And for the 2012 election, where absentee.vote.pct = (c1b+f1g)/f1a.
abvo <- read.csv("ABV2012.csv", header=T)
data$absentee <- abvo$absentee.transmit.pct[match(data$inputstate, abvo$State)]
rm(abvo);gc()

#### Respondent covariates
## Race and gender
data$race.white <- 1
data$race.white[data$race!="White"] <- 0
data$race.black <- 0
data$race.black[data$race=="Black"] <- 1
data$race.latino <- 0
data$race.latino[data$race=="Hispanic"] <- 1
data$female <- 1
data$female[data$gender=="Male"] <- 0

## Higher education
data$college <- 0
data$college[data$educ == "2-year"] <- 1
data$college[data$educ == "4-year"] <- 1
data$college[data$educ == "Post-grad"] <- 1

## Household income
data$hh.income <- 0
data$hh.income[data$faminc == "$10,000 - $19,999"] <- 1
data$hh.income[data$faminc == "$20,000 - $29,999"] <- 2
data$hh.income[data$faminc == "$30,000 - $39,999"] <- 3
data$hh.income[data$faminc == "$40,000 - $49,999"] <- 4
data$hh.income[data$faminc == "$50,000 - $59,999"] <- 5
data$hh.income[data$faminc == "$60,000 - $69,999"] <- 6
data$hh.income[data$faminc == "$70,000 - $79,999"] <- 7
data$hh.income[data$faminc == "$80,000 - $99,999"] <- 8
data$hh.income[data$faminc == "$100,000 - $119,999"] <- 9
data$hh.income[data$faminc == "$120,000 - $149,999"] <- 10
data$hh.income[data$faminc == "$150,000 - $199,999"] <- 11
data$hh.income[data$faminc == "$200,000 - $249,999"] <- 11
data$hh.income[data$faminc == "$250,000 - $349,999"] <- 11
data$hh.income[data$faminc == "$350,000 - $499,999"] <- 11
data$hh.income[data$faminc == "$150,000 or more "] <- 11
data$hh.income[data$faminc == "$500,000 or more"] <- 12
data$hh.income[is.na(data$faminc)] <- NA
data$hh.income[data$faminc == "Prefer not to say"] <- NA

## Reported ideology
data$conservative <- NA
data$conservative[data$ideo5 == "Very liberal"] <- 0
data$conservative[data$ideo5 == "Liberal"] <- 1
data$conservative[data$ideo5 == "Moderate"] <- 2
data$conservative[data$ideo5 == "Conservative"] <- 3
data$conservative[data$ideo5 == "Very Conservative"] <- 4
## Ideology as binary
data$conservative.dum <- 0
data$conservative.dum[data$conservative > 2] <- 1

## Partisan identification indicator
data$dems <- 0
data$dems[data$pid3 == "Democrat"] <- 1

#### Dependent variables
## Generate numeric treatment indicators for ictreg
data$fraud.treat <- 0
data$fraud.treat[data$treat=="A5, B4"] <- 1
data$buy.treat <- 0
data$buy.treat[data$treat=="A4, B5"] <- 1
## Count variables
data$q1a.count <- data$q1a #for compaitbility with older code written with factors
data$q1b.count <- data$q1b

#### HEADLINE RESULTS ####
## Treatment effect, voter impersonation: Full sample, unweighted, robust SEs
fit.1a <- lm(q1a.count~ as.factor(fraud.treat), data = data)
fraud <- coef(fit.1a)[2]
SEs.1a <- sqrt(diag(vcovHC(fit.1a)))
n <- length(resid(fit.1a))
CI.1a <- c(fraud - qt(0.975,df=(n-2))*SEs.1a[2], fraud + qt(0.975,df=(n-2))*SEs.1a[2])
CI.1a
## Treatment effect, voter impersonation: Full sample, weighted, robust SEs
fit.1a.w <- lm(q1a.count~ as.factor(fraud.treat), data = data, weights=weight)
fraud.w <- coef(fit.1a.w)[2]
SEs.1a.w <- sqrt(diag(vcovHC(fit.1a.w)))
n <- sum(fit.1a.w$weights)
CI.1a.w <- c(fraud.w - qt(0.975,df=(n-2))*SEs.1a.w[2], fraud.w + qt(0.975,df=(n-2))*SEs.1a.w[2])
CI.1a.w

### Subsets of the sample, impersonation only
## Treatment effect, impersonation: Contested states
fit.1a.cont <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, contested==1))
fraud.cont <- coef(fit.1a.cont)[2]
SEs.1a.cont <- sqrt(diag(vcovHC(fit.1a.cont)))
n <- length(resid(fit.1a.cont))
CI.1a.cont <- c(fraud.cont - qt(0.975,df=(n-2))*SEs.1a.cont[2],fraud.cont + qt(0.975,df=(n-2))*SEs.1a.cont[2])
CI.1a.cont
## Treatment effect, impersonation: Uncontested states
fit.1a.ncont <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, contested==0))
fraud.ncont <- coef(fit.1a.ncont)[2]
SEs.1a.ncont <- sqrt(diag(vcovHC(fit.1a.ncont)))
n <- length(resid(fit.1a.ncont))
CI.1a.ncont <- c(fraud.ncont - qt(0.975,df=(n-2))*SEs.1a.ncont[2], 
                 fraud.ncont + qt(0.975,df=(n-2))*SEs.1a.ncont[2])
CI.1a.ncont
## Treatment effect, impersonation: Contested seats with max contestation
fit.1a.cont.max <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, contested.max==1))
fraud.cont.max <- coef(fit.1a.cont.max)[2]
SEs.1a.cont.max <- sqrt(diag(vcovHC(fit.1a.cont.max)))
n <- length(resid(fit.1a.cont.max))
CI.1a.cont.max <- c(fraud.cont.max - qt(0.975,df=(n-2))*SEs.1a.cont.max[2],
                   fraud.cont.max + qt(0.975,df=(n-2))*SEs.1a.cont.max[2])
CI.1a.cont.max
## Treatment effect, impersonation: Uncontested seats with max contestation
fit.1a.ncont.max <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, contested.max==0))
fraud.ncont.max <- coef(fit.1a.ncont.max)[2]
SEs.1a.ncont.max <- sqrt(diag(vcovHC(fit.1a.ncont.max)))
n <- length(resid(fit.1a.ncont.max))
CI.1a.ncont.max <- c(fraud.ncont.max - qt(0.975,df=(n-2))*SEs.1a.ncont.max[2],
                     fraud.ncont.max + qt(0.975,df=(n-2))*SEs.1a.ncont.max[2])
CI.1a.ncont.max
## Treatment effect, impersonation: EDR
fit.1a.edr <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, edr==1))
fraud.edr <- coef(fit.1a.edr)[2]
SEs.1a.edr <- sqrt(diag(vcovHC(fit.1a.edr)))
n <- length(resid(fit.1a.edr))
CI.1a.edr <- c(fraud.edr - qt(0.975,df=(n-2))*SEs.1a.edr[2],fraud.edr + qt(0.975,df=(n-2))*SEs.1a.edr[2])
CI.1a.edr
## Treatment effect, impersonation: No EDR
fit.1a.nedr <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, edr==0))
fraud.nedr <- coef(fit.1a.nedr)[2]
SEs.1a.nedr <- sqrt(diag(vcovHC(fit.1a.nedr)))
n <- length(resid(fit.1a.edr))
CI.1a.nedr <- c(fraud.nedr - qt(0.975,df=(n-2))*SEs.1a.nedr[2], 
                fraud.nedr + qt(0.975,df=(n-2))*SEs.1a.nedr[2])
CI.1a.nedr
## Treatment effect, impersonation: Strict ID states
fit.1a.sid <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, strict.id==1))
fraud.sid <- coef(fit.1a.sid)[2]
SEs.1a.sid <- sqrt(diag(vcovHC(fit.1a.sid)))
n <- length(resid(fit.1a.sid))
CI.1a.sid <- c(fraud.sid - qt(0.975,df=(n-2))*SEs.1a.sid[2], fraud.sid + qt(0.975,df=(n-2))*SEs.1a.sid[2])
CI.1a.sid
## Treatment effect, impersonation: No strict ID states
fit.1a.nsid <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, strict.id==0))
fraud.nsid <- coef(fit.1a.nsid)[2]
SEs.1a.nsid <- sqrt(diag(vcovHC(fit.1a.nsid)))
n <- length(resid(fit.1a.nsid))
CI.1a.nsid <- c(fraud.nsid - qt(0.975,df=(n-2))*SEs.1a.nsid[2], 
                fraud.nsid + qt(0.975,df=(n-2))*SEs.1a.nsid[2])
CI.1a.nsid
## Treatment effect, impersonation: *Very* strict ID states
fit.1a.sid2 <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, state.voter.id==4))
fraud.sid2 <- coef(fit.1a.sid2)[2]
SEs.1a.sid2 <- sqrt(diag(vcovHC(fit.1a.sid2)))
n <- length(resid(fit.1a.sid2))
CI.1a.sid2 <- c(fraud.sid2 - qt(0.975,df=(n-2))*SEs.1a.sid2[2],
                fraud.sid2 + qt(0.975,df=(n-2))*SEs.1a.sid2[2])
CI.1a.sid2
## Treatment effect, impersonation: No *very* strict ID states
fit.1a.nsid2 <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, state.voter.id!=4))
fraud.nsid2 <- coef(fit.1a.nsid2)[2]
SEs.1a.nsid2 <- sqrt(diag(vcovHC(fit.1a.nsid2)))
n <- length(resid(fit.1a.nsid2))
CI.1a.nsid2 <- c(fraud.nsid2 - qt(0.975,df=(n-2))*SEs.1a.nsid2[2],
                 fraud.nsid2 + qt(0.975,df=(n-2))*SEs.1a.nsid2[2])
CI.1a.nsid2
## Treatment effect, impersonation: Female only
fit.1a.fem <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, female==1), weights=weight)
fraud.fem <- coef(fit.1a.fem)[2]
SEs.1a.fem <- sqrt(diag(vcovHC(fit.1a.fem)))
n <- length(resid(fit.1a.fem))
CI.1a.fem <- c(fraud.fem - qt(0.975,df=(n-2))*SEs.1a.fem[2], fraud.fem + qt(0.975,df=(n-2))*SEs.1a.fem[2])
CI.1a.fem
## Treatment effect, impersonation: Male only
fit.1a.nfem <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, female==0), weights=weight)
fraud.nfem <- coef(fit.1a.nfem)[2]
SEs.1a.nfem <- sqrt(diag(vcovHC(fit.1a.nfem)))
n <- length(resid(fit.1a.nfem))
CI.1a.nfem <- c(fraud.nfem - qt(0.975,df=(n-2))*SEs.1a.nfem[2], 
                fraud.nfem + qt(0.975,df=(n-2))*SEs.1a.nfem[2])
CI.1a.nfem
## Treatment effect, impersonation: White only
fit.1a.white <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, race.white==1), weights=weight)
fraud.white <- coef(fit.1a.white)[2]
SEs.1a.white <- sqrt(diag(vcovHC(fit.1a.white)))
n <- length(resid(fit.1a.white))
CI.1a.white <- c(fraud.white - qt(0.975,df=(n-2))*SEs.1a.white[2], 
                 fraud.white + qt(0.975,df=(n-2))*SEs.1a.white[2])
CI.1a.white
## Treatment effect, impersonation: Non-white only
fit.1a.nwhite <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, race.white==0), weights=weight)
fraud.nwhite <- coef(fit.1a.nwhite)[2]
SEs.1a.nwhite <- sqrt(diag(vcovHC(fit.1a.nwhite)))
n <- length(resid(fit.1a.nwhite))
CI.1a.nwhite <- c(fraud.nwhite - qt(0.975,df=(n-2))*SEs.1a.nwhite[2], 
                  fraud.nwhite + qt(0.975,df=(n-2))*SEs.1a.nwhite[2])
CI.1a.nwhite
## Treatment effect, impersonation: Democrats only
fit.1a.dem <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, dems==1), weights=weight)
fraud.dem <- coef(fit.1a.dem)[2]
SEs.1a.dem <- sqrt(diag(vcovHC(fit.1a.dem)))
n <- length(resid(fit.1a.dem))
CI.1a.dem <- c(fraud.dem - qt(0.975,df=(n-2))*SEs.1a.dem[2], fraud.dem + qt(0.975,df=(n-2))*SEs.1a.dem[2])
CI.1a.dem
## Treatment effect, impersonation: Non-Democrats only
fit.1a.ndem <- lm(q1a.count~ as.factor(fraud.treat), data = subset(data, dems==0), weights=weight)
fraud.ndem <- coef(fit.1a.ndem)[2]
SEs.1a.ndem <- sqrt(diag(vcovHC(fit.1a.ndem)))
n <- length(resid(fit.1a.ndem))
CI.1a.ndem <- c(fraud.ndem - qt(0.975,df=(n-2))*SEs.1a.ndem[2], 
                fraud.ndem + qt(0.975,df=(n-2))*SEs.1a.ndem[2])
CI.1a.ndem


#### FIGURE 1 ####
plot.data <- data.frame(
                ques=c("male \n (weighted)", "female \n (weighted)", "non-Democrat \n (weighted)", "Democrat \n (weighted)", "non-white \n (weighted)", "white \n (weighted)",
                  "non-strict", "strict ID states", "no EDR", "EDR states", "uncontested", "contested states",
                  "full sample \n (weighted)","full sample"),
                w.stat=c("weighted", "weighted", "weighted", "weighted", "weighted", "weighted", "unweighted", "unweighted", "unweighted",
                  "unweighted", "unweighted", "unweighted", "weighted", "unweighted"),
                y=c(fraud.nfem, fraud.fem, fraud.ndem, fraud.dem, fraud.nwhite, fraud.white, fraud.nsid, fraud.sid, fraud.nedr,
                  fraud.edr, fraud.ncont.max, fraud.cont.max, fraud.w, fraud),
                lower=c(CI.1a.nfem[1], CI.1a.fem[1], CI.1a.ndem[1], CI.1a.dem[1], CI.1a.nwhite[1], CI.1a.white[1], CI.1a.nsid[1], CI.1a.sid[1], 
                        CI.1a.nedr[1], CI.1a.edr[1], CI.1a.ncont.max[1], CI.1a.cont.max[1], CI.1a.w[1], CI.1a[1]),
                upper=c(CI.1a.nfem[2], CI.1a.fem[2], CI.1a.ndem[2], CI.1a.dem[2], CI.1a.nwhite[2], CI.1a.white[2], CI.1a.nsid[2], CI.1a.sid[2], 
                        CI.1a.nedr[2], CI.1a.edr[2], CI.1a.ncont.max[2], CI.1a.cont.max[2], CI.1a.w[2], CI.1a[2])
                     )
plot.data$ques <- factor(plot.data$ques, as.character(plot.data$ques))
lims <- aes(ymax = upper, ymin=lower)
mytitle <- "Voter impersonation in the 2012 election\n relevent sub-groups of Dec 2012 national sample"
fig1 <- ggplot(plot.data, aes(y=y, x=ques, group = w.stat, color=w.stat))
pdf("Figure_1.pdf")
fig1 + geom_point(size=4) + geom_pointrange(lims, width=.2, size=1.25 ) +
  labs(x = NULL, title = mytitle, y = "proportion of voters\n(difference in means)", color="") +
  theme_bw() + ylim(-.8,.8) + geom_hline(yintercept=0,linetype="dashed") +theme(legend.position="none") +
  coord_flip()
dev.off()
rm(lims,mytitle,n,plot.data,fig1,CI.1a.cont,CI.1a.cont.max,CI.1a.dem,CI.1a.edr,CI.1a.fem,CI.1a.ncont,
   CI.1a.ncont.max,CI.1a.ndem,CI.1a.nedr,CI.1a.nfem,CI.1a.nsid,CI.1a.nsid2,CI.1a.nwhite,CI.1a.sid,
   CI.1a.sid2,CI.1a.white,fit.1a.cont,fit.1a.cont.max,fit.1a.dem,fit.1a.edr,fit.1a.fem,
   fit.1a.ncont,fit.1a.ncont.max,fit.1a.ndem,fit.1a.nedr,fit.1a.nfem,fit.1a.nsid,fit.1a.nsid2,
   fit.1a.nwhite,fit.1a.sid,fit.1a.sid2,fit.1a.white,fraud.cont,fraud.cont.max,fraud.dem,fraud.edr,
   fraud.fem,fraud.ncont,fraud.ncont.max,fraud.ndem,fraud.nedr,fraud.nfem,fraud.nsid,fraud.nsid2,
   fraud.nwhite,fraud.sid,fraud.sid2,fraud.white,SEs.1a.cont,SEs.1a.cont.max,SEs.1a.dem,SEs.1a.edr,
   SEs.1a.fem,SEs.1a.ncont,SEs.1a.ncont.max,SEs.1a.ndem,SEs.1a.nedr,SEs.1a.nfem,SEs.1a.nsid,SEs.1a.nsid2,
   SEs.1a.nwhite,SEs.1a.sid,SEs.1a.sid2,SEs.1a.white); gc()


#### ICT REGRESSION RESULTS ####
## Model 1, impersonation only
ict.fit1a.base <- ictreg(q1a.count ~ strict.id + edr + absentee + contested.max +  race.white + female  +
                       conservative.dum + college, treat = "fraud.treat", J=4, data = data) 
summary(ict.fit1a.base)
## Model 2, imputing for missingness
imp.x <- data[,c("q1a.count", "fraud.treat", "strict.id", "edr", "absentee", "contested.max", "hh.income",
               "race.white", "race.black", "race.latino", "female", "conservative.dum", "college", 
               "pp_ownorrent", "pp_educ")]
imp.bounds <- matrix(c(1,0,5,7,0,12), ncol=3, byrow=T) #bounds for q1a.count and hh.income
imp.out <- amelia(imp.x, m=20, noms = c("pp_ownorrent", "pp_educ"), bounds=imp.bounds)
b.out <- se.out <- bic.out <- predict.out <- predict.se <- NULL
for(i in 1:20){
	imp.out$imputations[[i]]$q1a.count <- round(imp.out$imputations[[i]]$q1a.count)
	ict.out <- ictreg(q1a.count ~ strict.id + edr + absentee + contested.max + race.white + female +
                       conservative.dum + college + hh.income, treat = "fraud.treat", J=4, data = imp.out$imputations[[i]])
	bic.temp <- ict.bic(ict.out)
	preds <- predict(ict.out, se.fit=T)
	b.out <- rbind(b.out, coef(ict.out))
	se.out <- rbind(se.out, sqrt(diag(vcov(ict.out))))
	predict.out <- rbind(predict.out,preds[[1]])
	predict.se <- rbind(predict.se, preds[[2]])
	bic.out <- rbind(bic.out, bic.temp)
}
ict.mi.out <- mi.meld(q=b.out, se = se.out)
ict.mi.hat <- mi.meld(q=predict.out, se = predict.se)

#### TABLE 2 ####
cbind(c(summary(ict.fit1a.base)$par.treat,NA,summary(ict.fit1a.base)$par.control,NA),
      c(summary(ict.fit1a.base)$se.treat,NA,summary(ict.fit1a.base)$se.control,NA),
      t(ict.mi.out[[1]]),t(ict.mi.out[[2]]))
rm(b.out,bic.out,imp.bounds,imp.x,se.out,bic.temp,i,ict.mi.out,ict.out,imp.out,
   preds,predict.out,predict.se); gc()

#### WAVE 2 DATA ####
w2.data <- read.spss(file="AMJ_wave_2.sav",to.data.frame=TRUE)
## Merge together responses across randomized question ordering
QA <- w2.data$Qa
QA[is.na(w2.data$Qa)] <- w2.data$Qa2[is.na(w2.data$Qa)]
QA <- as.numeric(QA)-1
QB <- w2.data$Qb
QB[is.na(w2.data$Qb)] <- w2.data$Qb2[is.na(w2.data$Qb)]
QB <- as.numeric(QB)-1
QC <- w2.data$Qc
QC[is.na(w2.data$Qc)] <- w2.data$Qc2[is.na(w2.data$Qc)]
QC <- as.numeric(QC)-1
QD <- w2.data$Qd
QD[is.na(w2.data$Qd)] <- w2.data$Qd2[is.na(w2.data$Qd)]
QD <- as.numeric(QD)-1
vf.treat <- w2.data$group=="Group 1 - 5A, 4B, 4C, 5D"
vb.treat <- 1-vf.treat
w2.data <- cbind(w2.data,vf.treat,vb.treat, QA, QB, QC, QD)


#### RESULTS, WAVE 2 #### 
## Difference in means, voter impersonation, unweighted
vf.fit <- lm(QA~vf.treat)
fraud2 <- coef(vf.fit)[2]
vf.se <- sqrt(diag(vcovHC(vf.fit)))
n <- length(resid(vf.fit))
CI.vf <- c(fraud2 - qt(0.975,df=(n-2))*vf.se[2],
         fraud2 + qt(0.975,df=(n-2))*vf.se[2])
## Difference in means, voter impersonation, weighted
vf.fit.w <- lm(QA~vf.treat, weights = w2.data$weight, data=w2.data)
vf.w.se <- sqrt(diag(vcovHC(vf.fit.w)))
fraud2.w <- coef(vf.fit.w)[2]
n <- length(resid(vf.fit.w))
CI.vf.w <- c(fraud2.w - qt(0.975,df=(n-2))*vf.w.se[2],
         fraud2.w + qt(0.975,df=(n-2))*vf.w.se[2])
## Difference in means, alien abduction, unweighted
alien.fit <- lm(QD~vf.treat, data=w2.data)
alien.se <- sqrt(diag(vcovHC(alien.fit)))
alien <- coef(alien.fit)[2]
n <- length(resid(alien.fit))
CI.alien <- c(alien - qt(0.975,df=(n-2))*alien.se[2],
         alien + qt(0.975,df=(n-2))*alien.se[2])
# Difference in means, alien abduction, weighted
alien.fit.w <- lm(QD~vf.treat, weights=w2.data$weight, data=w2.data)
alien.w.se <- sqrt(diag(vcovHC(alien.fit.w)))
alien.w <- coef(alien.fit.w)[2]
CI.alien.w <- c(alien.w - qt(0.975,df=(n-2))*alien.w.se[2],
         alien.w + qt(0.975,df=(n-2))*alien.w.se[2])

#### FIGURE 2 ####
plot.data <- data.frame(
                     ques=c("impersonation", "impersonation \n (weighted)",
                      #"vote buying", "vote buying \n (weighted)",
                      "alien abduction", "alien abduction \n (weighted)"),
                     ques1=c("fraud", "fraud", #"vote buying", "vote buying",
                     	"abduction", "abduction"),
                     w.stat=c("unweighted", "weighted",#"unweighted", "weighted",
                     	"unweighted", "weighted"),
                     y=c(fraud2, fraud2.w, alien, alien.w),
                     lower=c(CI.vf[1], CI.vf.w[1],CI.alien[1],CI.alien.w[1]),
                     upper=c(CI.vf[2], CI.vf.w[2],CI.alien[2],CI.alien.w[2])
                     )

new.data <- data.frame(ques=c("impersonation", "impersonation \n (weighted)"),
                      #"vote buying", "vote buying \n (weighted)"),
                     ques1=c("fraud", "fraud"),
                     w.stat=c("unweighted", "weighted"),
                     y=c(fraud, fraud.w)
	)

lims <- aes(ymax = upper, ymin=lower)
mytitle <- "Irregular voting behavior & alien abduction \n Sept. 2013 national sample"
fig2 <- ggplot(plot.data, aes(y=y, x=ques, group = w.stat, shape=ques1, color=w.stat))
pdf("Figure_2.pdf")
	fig2 + geom_point(size=4) + geom_pointrange(lims, width=.2, size=1.25 ) +
	  labs(x = NULL, title = mytitle, y = "proportion of voters (difference in means)", color="") +
	  theme_bw() + ylim(-.25,.25) + geom_hline(yintercept=0,linetype="dashed") + theme(legend.position="none") +
	  geom_point(data= new.data, size = 4, shape = c(2,2), alpha=I(0.9)) + coord_flip()     
dev.off()
rm(plot.data,fig2,lims,mytitle,new.data,n,alien,alien.fit,alien.se,alien.w,alien.w.se,
   CI.alien,CI.alien.w,CI.vf,CI.vf.w,vf.fit,vf.treat,vf.se,vf.w.se)

#### Who said "5" for fraud and abduction? ####
table(QA)
table(QD)
abductees <- which(w2.data$QD==5)
fraudsters <- which(w2.data$QA==5)
length(intersect(abductees,fraudsters)) 
## 9 respondents, about 1/5 of those admitting fraud were also abducted
## Lower bound if we accept all 41 reporting fraud is ~ 2.5% 
length(fraudsters)/nrow(w2.data)


#### POWER CALCULATION ####
## Calculating the sample size necessary to detect a difference in 0.01 with 80% power (see Fn 31).
power.t.test(n = NULL, delta = .01, sd = 1.03, sig.level = 0.05, power = 0.80,
             alternative = "one.sided", type = "two.sample")


#### SUPPLEMENTARY ANALYSIS AND RESULTS NOT REPORTED IN PAPER ####
## Wave 1: Means across treatment/control group
tapply(data$q1a.count,data$treat,mean,na.rm=TRUE)
tapply(data$q1b.count,data$treat,mean,na.rm=TRUE)
## Summary of responses to list experiment
prop.table(table(data$q1a,data$treat,exclude=NULL),2)*100
prop.table(table(data$q1b,data$treat,exclude=NULL),2)*100
## Summary of responses to list experiment, cumulative
tab1a <- t(table(data$q1a,data$fraud.treat))
tab1a[1,] <- tab1a[1,][order(colnames(tab1a),decreasing = T)]
tab1a[2,] <- tab1a[2,][order(colnames(tab1a),decreasing = T)]
colnames(tab1a) <- order(colnames(tab1a),decreasing = T) - 1
cum.tab <- apply(tab1a,1,cumsum)
cum.tab[,1] <- cum.tab[,1]/max(cum.tab[,1])
cum.tab[,2] <- cum.tab[,2]/max(cum.tab[,2])
cum.tab

## Treatment effect, vote buying: Full sample, unweighted, robust SEs
fit.1b <- lm(q1b.count~ as.factor(treat), data = data)
bought <- -coef(fit.1b)[2]
SEs.1b <- sqrt(diag(vcovHC(fit.1b)))
n <- length(resid(fit.1b))
CI.1b <- c(bought - qt(0.975,df=(n-2))*SEs.1b[2], bought + qt(0.975,df=(n-2))*SEs.1b[2])
CI.1b
## Treatment effect, vote buying: Full sample, weighted, robust SEs
fit.1b.w <- lm(q1b.count~ as.factor(treat), data = data, weights=weight)
bought.w <- -coef(fit.1b.w)[2]
SEs.1b.w <- sqrt(diag(vcovHC(fit.1b.w)))
n <- sum(fit.1b.w$weights)
CI.1b.w <- c(bought.w - qt(0.975,df=(n-2))*SEs.1b.w[2],
             bought.w + qt(0.975,df=(n-2))*SEs.1b.w[2])
CI.1b.w

## Wave 2
## Difference in means, texting, unweighted
sms.fit <- lm(QC~vb.treat)
sms.se <- sqrt(diag(vcovHC(sms.fit)))
sms <- coef(sms.fit)[2]
n <- length(resid(sms.fit))
CI.sms <- c(sms - qt(0.975,df=(n-2))*sms.se[2],
          sms + qt(0.975,df=(n-2))*sms.se[2])
## Difference in means, texting, weighted
sms.fit.w <- lm(QC~vb.treat, data = w2.data, weights=w2.data$weight)
sms.w.se <- sqrt(diag(vcovHC(sms.fit.w)))
sms.w <- coef(sms.fit.w)[2]
CI.sms.w <- c(sms.w - qt(0.975,df=(n-2))*sms.w.se[2],
            sms.w + qt(0.975,df=(n-2))*sms.w.se[2])


#### ROBUSTNESS ####
## WAVE 1
## Introducing a ceiling. See ?ictreg.
ict.fit1a.ceil <- ictreg(q1a.count ~ strict.id + edr + absentee + contested.max +  race.white + female  +
                         conservative.dum + college, treat = "fraud.treat", J=4, data = data,
                       ceiling = T, ceiling.fit = "bayesglm") 
summary(ict.fit1a.ceil) 
## Household income model without imputation
ict.fit1a.inc <- ictreg(q1a.count ~ strict.id + edr + absentee + contested.max + hh.income + race.white + female +
                        conservative.dum + college, treat = "fraud.treat", J=4, data = data) 
summary(ict.fit1a.inc) 
## Checking for design effect. See ?ict.test. No evidence of design effect.
test.dat <- na.omit(cbind(data$q1a.count,data$fraud.treat))
ict.test(y = test.dat[,1], treat = test.dat[,2], J = 4)
test.dat <- na.omit(cbind(data$q1b.count,data$buy.treat ))
ict.test(y = test.dat[,1], treat = test.dat[,2], J = 4)

## WAVE 2
## Checking question order randomization; difference not distingishable from 0
vf.fit.w1 <- lm((as.numeric(Qa)-1)~vf.treat, weights = w2.data$weight, data=w2.data)
vf.w1.se <- sqrt(diag(vcovHC(vf.fit.w1)))
summary(vf.fit.w1)
## Looking at only registered voters
vf.fit.w.rv <- update(vf.fit.w,.~., subset=w2.data$votereg=="Yes")
#looking at only registered voters
alien.fit.w.rv <- update(alien.fit.w,.~., subset=w2.data$votereg=="Yes")
## Checking for design effect. See ?ict.test. No evidence of design effect.
ict.test(y=w2.data$QA, treat= w2.data$vf.treat, J = 4)
ict.test(y=w2.data$QB, treat= w2.data$vb.treat, J = 4)
ict.test(y=w2.data$QC, treat= w2.data$vb.treat, J = 4)
ict.test(y=w2.data$QD, treat= w2.data$vf.treat, J = 4)
## Work with registered voters only
rv.w2.data <- subset(w2.data, votereg=="Yes")
table(rv.w2.data$QA)
table(rv.w2.data$QD)
abductees.rv <- which(rv.w2.data$QD==5)
fraudsters.rv <- which(rv.w2.data$QA==5)
length(intersect(abductees.rv,fraudsters.rv)) 
# 7 of those admitting to fraud were also abducted. Similarly implausible as with all respondents

#### PREDICTED PREVALENCIES ####
## Get weights
fit1a.base.weight <- data$weight[match(rownames(ict.fit1a.base$x),rownames(data))]
## Get predicted probability of a positive response to the sensitive item 
p.1a.base <- predict(ict.fit1a.base, se.fit=T)
## Weight it to national sample
pop.fraud.base <- sum(p.1a.base$fit*fit1a.base.weight)/sum(fit1a.base.weight)
## Get the standard deviation of the weighted mean
pop.fraud.base.se <- sqrt(sum(fit1a.base.weight*((p.1a.base$fit-pop.fraud.base)^2))*
                          (sum(fit1a.base.weight)/(sum(fit1a.base.weight)^2 - sum(fit1a.base.weight^2))))
## Same for the income model
fit1a.inc.weight <- data$weight[match(rownames(ict.fit1a.inc$x),rownames(data))]
p.1a.inc <- predict(ict.fit1a.inc, se.fit=T)
pop.fraud.inc <- sum(p.1a.inc$fit*fit1a.inc.weight)/sum(fit1a.inc.weight)
pop.fraud.inc.se <- sqrt(sum(fit1a.inc.weight*((p.1a.inc$fit-pop.fraud.inc)^2))*
                         (sum(fit1a.inc.weight)/(sum(fit1a.inc.weight)^2 - sum(fit1a.inc.weight^2))))
## Same for imputed model
pop.fraud.impute <- sum(ict.mi.hat$q.mi*data$weight)/sum(data$weight)
pop.fraud.impute.se <- sqrt(sum(data$weight*((ict.mi.hat$q.mi-pop.fraud.impute)^2))*
                            (sum(data$weight)/(sum(data$weight)^2 - sum(data$weight^2))))

plot.data <- data.frame(
  mod=c("Model 1", "Model 2"),
  w.stat=c("weighted", "weighted"),
  y=c(pop.fraud.base, pop.fraud.impute),
  lower=c(pop.fraud.base - 2*pop.fraud.base.se,
          pop.fraud.impute - 2*pop.fraud.impute.se),
  upper=c(pop.fraud.base + 2*pop.fraud.base.se,
          pop.fraud.impute + 2*pop.fraud.impute.se)
)
plot.data$mod <- factor(plot.data$mod, as.character(plot.data$mod))
lims <- aes(ymax = upper, ymin=lower)
mytitle <- "Predicted prevalence of voter impersonation, 2012 election\n weighted data"
supfig1 <- ggplot(plot.data, aes(y=y, x=mod, group = w.stat, color=w.stat))
pdf("Supp_Figure_1.pdf")
supfig1 + geom_point(size=4) + geom_pointrange(lims, width=.2, size=1.25 ) +
  labs(x = NULL, title = mytitle, y = "proportion of voters", color="") +
  theme_bw() + geom_hline(yintercept=0,linetype="dashed") +theme(legend.position="none") +
  coord_flip()+ scale_color_hue(h.start=180)
dev.off()


#### SUPPLEMENTARY PLOTS ####
## Treatment effect, impersonation
plot.data <- data.frame(
  ques=c("impersonation", "impersonation \n (weighted)"),
  ques1=c("fraud", "fraud"),
  w.stat=c("unweighted", "weighted"),
  y=c(fraud, fraud.w),
  lower=c(CI.1a[1], CI.1a.w[1]),
  upper=c(CI.1a[2], CI.1a.w[2])
)
lims <- aes(ymax = upper, ymin=lower)
mytitle <- "Voter impersonation in the 2012 election\n national sample"
suppfig2 <- ggplot(plot.data, aes(y=y, x=ques, group = w.stat, shape=ques1, color=w.stat))
pdf("Supp_Figure_2.pdf")
suppfig2 + geom_point(size=4) + geom_errorbar(lims, width=.2, size=1.25 ) +
  labs(x = NULL, title = mytitle, y = "proportion of voters (difference in means)", color="") +
  theme_bw() + ylim(-.25,.25) + geom_hline(yintercept=0,linetype="dashed") +theme(legend.position="none")
dev.off()

## Treatment effect, vote buying
plot.data <- data.frame(
  ques=c("vote buying", "vote buying \n (weighted)"),
  ques1=c("buy", "buy"),
  w.stat=c("unweighted", "weighted"),
  y=c(bought, bought.w),
  lower=c(CI.1b[1], CI.1b.w[1]),
  upper=c(CI.1b[2], CI.1b.w[2])
)
lims <- aes(ymax = upper, ymin=lower)
mytitle <- "Vote buying in the 2012 election\n national sample"
suppfig3 <- ggplot(plot.data, aes(y=y, x=ques, group = w.stat, shape=ques1, color=w.stat))
pdf("Supp_Figure_3.pdf")
suppfig3 + geom_point(size=4) + geom_errorbar(lims, width=.2, size=1.25 ) +
  labs(x = NULL, title = mytitle, y = "proportion of voters (difference in means)", color="") +
  theme_bw() + ylim(-.25,.25) + geom_hline(yintercept=0,linetype="dashed") +theme(legend.position="none")
dev.off()

## Treatment effect, figure impersonation and vote buying
plot.data <- data.frame(
  ques=c("impersonation", "impersonation \n (weighted)", "vote buying", "vote buying \n (weighted)"),
  ques1=c("fraud", "fraud", "vote buying", "vote buying"),
  w.stat=c("unweighted", "weighted","unweighted", "weighted"),
  y=c(fraud, fraud.w, bought, bought.w),
  lower=c(CI.1a[1], CI.1a.w[1], CI.1b[1], CI.1b.w[1]),
  upper=c(CI.1a[2], CI.1a.w[2], CI.1b[2], CI.1b.w[2])
)
lims <- aes(ymax = upper, ymin=lower)
mytitle <- "Irregular voting behavior in the 2012 election\n Dec. 2012 national sample"
suppfig4 <- ggplot(plot.data, aes(y=y, x=ques, group = w.stat, shape=ques1, color=w.stat))
pdf("Supp_Figure_4.pdf")
suppfig4 + geom_point(size=4) + geom_pointrange(lims, width=.2, size=1.25 ) +
  labs(x = NULL, title = mytitle, y = "proportion of voters (difference in means)", color="") +
  theme_bw() + ylim(-.25,.25) + geom_hline(yintercept=0,linetype="dashed") +theme(legend.position="none") + coord_flip()
dev.off()
